0. Read data

# Point the session at the project folder; the reads below use paths relative
# to it.
# NOTE(review): a hard-coded absolute path ties this script to one machine.
setwd("/Users/mandyhong/Desktop/DA401")

# Hate-crime time series, one file per victim group.
allHC <- read.csv(file = "all_race_hc_ts.csv")
blackHC <- read.csv(file = "black_hc_ts.csv")
asianHC <- read.csv(file = "asian_hc_ts.csv")

# Census covariates together with hate-crime counts and rates.
DF <- read.csv(file = "census_w_hc_five_agg.csv")
summary(DF)
##       YEAR      MEDIAN_INCOME    MEAN_INCOME          POP          
##  Min.   :2018   Min.   :43539   Min.   : 60672   Min.   :  582087  
##  1st Qu.:2018   1st Qu.:53334   1st Qu.: 72177   1st Qu.: 1846425  
##  Median :2018   Median :58671   Median : 77483   Median : 4548930  
##  Mean   :2018   Mean   :60177   Mean   : 81477   Mean   : 6441289  
##  3rd Qu.:2018   3rd Qu.:67724   3rd Qu.: 89860   3rd Qu.: 7203582  
##  Max.   :2018   Max.   :81744   Max.   :110050   Max.   :39083067  
##   ONE_RACE_POP          WHITE              BLACK             ASIAN        
##  Min.   :  565154   Min.   :  352867   Min.   :   4910   Min.   :   5032  
##  1st Qu.: 1807184   1st Qu.: 1565282   1st Qu.:  66353   1st Qu.:  38463  
##  Median : 4449047   Median : 3321945   Median : 347392   Median : 115148  
##  Mean   : 6210087   Mean   : 4667506   Mean   : 810521   Mean   : 350355  
##  3rd Qu.: 6806593   3rd Qu.: 5535156   3rd Qu.:1363354   3rd Qu.: 355764  
##  Max.   :36973311   Max.   :23264767   Max.   :3353300   Max.   :5597871  
##  FOREIGN_BORN_NON_US POVERTY_PERCENT UNEMPLOYMENT_RATE SCHOOL_ENROLLMENT 
##  Min.   :  10194     Min.   : 7.90   Min.   :2.840     Min.   :  146042  
##  1st Qu.:  69961     1st Qu.:11.13   1st Qu.:4.770     1st Qu.:  464009  
##  Median : 153082     Median :13.59   Median :5.910     Median : 1112828  
##  Mean   : 441622     Mean   :13.54   Mean   :5.716     Mean   : 1624249  
##  3rd Qu.: 431093     3rd Qu.:15.41   3rd Qu.:6.660     3rd Qu.: 1753807  
##  Max.   :5151904     Max.   :20.90   Max.   :8.240     Max.   :10458700  
##  X18_24_BACHELOR_HIGHER X25_34_BACHELOR_HIGHER X35_44_BACHELOR_HIGHER
##  Min.   :  3840         Min.   :  21763        Min.   :  21045       
##  1st Qu.: 14126         1st Qu.:  65052        1st Qu.:  65662       
##  Median : 35836         Median : 183529        Median : 177531       
##  Mean   : 66872         Mean   : 310356        Mean   : 289945       
##  3rd Qu.: 87784         3rd Qu.: 405686        3rd Qu.: 387764       
##  Max.   :398819         Max.   :2092631        Max.   :1870672       
##  X45_64_BACHELOR_HIGHER X65_MORE_BACHELOR_HIGHER WHITE_PER_CAPITA
##  Min.   :  39291        Min.   :  23273          Min.   :24.85   
##  1st Qu.: 140207        1st Qu.:  74472          1st Qu.:68.30   
##  Median : 325318        Median : 178468          Median :77.89   
##  Mean   : 510497        Mean   : 262811          Mean   :76.45   
##  3rd Qu.: 725982        3rd Qu.: 346725          3rd Qu.:85.22   
##  Max.   :3113981        Max.   :1640027          Max.   :94.38   
##  BLACK_PER_CAPITA  ASIAN_PER_CAPITA  ONE_RACE_RATIO  FOREIGN_BORN_NON_US_RATIO
##  Min.   : 0.4708   Min.   : 0.7638   Min.   :76.03   Min.   : 0.796           
##  1st Qu.: 3.3315   1st Qu.: 1.5428   1st Qu.:96.50   1st Qu.: 2.498           
##  Median : 7.3649   Median : 2.7109   Median :96.99   Median : 3.928           
##  Mean   :10.5345   Mean   : 4.1741   Mean   :96.29   Mean   : 4.786           
##  3rd Qu.:15.0909   3rd Qu.: 4.6028   3rd Qu.:97.43   3rd Qu.: 6.780           
##  Max.   :37.6340   Max.   :37.8426   Max.   :98.63   Max.   :13.184           
##  SCHOOL_ENROLLMENT_RATE ALL_HATE_CRIME    BLACK_HATE_CRIME ASIAN_HATE_CRIME
##  Min.   :21.49          Min.   :   6.00   Min.   :  2.40   Min.   : 0.20   
##  1st Qu.:24.03          1st Qu.:  28.25   1st Qu.: 10.90   1st Qu.: 0.80   
##  Median :24.95          Median :  67.50   Median : 26.00   Median : 2.70   
##  Mean   :24.89          Mean   : 147.22   Mean   : 54.83   Mean   : 6.28   
##  3rd Qu.:25.52          3rd Qu.: 163.75   3rd Qu.: 56.70   3rd Qu.: 7.30   
##  Max.   :31.90          Max.   :1086.60   Max.   :454.40   Max.   :46.40   
##  ALL_HC_PER_CAPITA BLACK_HC_PER_CAPITA_POP BLACK_HC_PER_CAPITA_RACE_POP
##  Min.   : 0.3255   Min.   : 0.0951         Min.   :   0.6937           
##  1st Qu.: 1.3407   1st Qu.: 0.4776         1st Qu.:  17.5300           
##  Median : 2.4570   Median : 0.9124         Median :  47.4428           
##  Mean   : 6.2577   Mean   : 2.2650         Mean   :  94.6701           
##  3rd Qu.: 6.8685   3rd Qu.: 2.2786         3rd Qu.:  87.9626           
##  Max.   :51.5159   Max.   :19.8888         Max.   :1732.7170           
##  ASIAN_HC_PER_CAPITA_POP ASIAN_HC_PER_CAPITA_RACE_POP BACHELOR_RATE  
##  Min.   :0.002662        Min.   :  0.0302             Min.   :14.99  
##  1st Qu.:0.024540        1st Qu.:  0.9113             1st Qu.:19.25  
##  Median :0.081968        Median :  3.9117             Median :21.18  
##  Mean   :0.254118        Mean   : 15.1502             Mean   :21.80  
##  3rd Qu.:0.241591        3rd Qu.: 11.7414             3rd Qu.:24.25  
##  Max.   :3.001644        Max.   :339.4914             Max.   :31.67
# Columns kept for modelling: ten covariates followed by the three per-capita
# hate-crime rates. The order matches the printed summary.
model_cols <- c(
  "MEDIAN_INCOME",
  "POP",
  "WHITE_PER_CAPITA",
  "BLACK_PER_CAPITA",
  "ASIAN_PER_CAPITA",
  "FOREIGN_BORN_NON_US_RATIO",
  "POVERTY_PERCENT",
  "UNEMPLOYMENT_RATE",
  "SCHOOL_ENROLLMENT_RATE",
  "BACHELOR_RATE",
  "ALL_HC_PER_CAPITA",
  "BLACK_HC_PER_CAPITA_POP",
  "ASIAN_HC_PER_CAPITA_POP"
)
DF2 <- DF[model_cols]
summary(DF2)
##  MEDIAN_INCOME        POP           WHITE_PER_CAPITA BLACK_PER_CAPITA 
##  Min.   :43539   Min.   :  582087   Min.   :24.85    Min.   : 0.4708  
##  1st Qu.:53334   1st Qu.: 1846425   1st Qu.:68.30    1st Qu.: 3.3315  
##  Median :58671   Median : 4548930   Median :77.89    Median : 7.3649  
##  Mean   :60177   Mean   : 6441289   Mean   :76.45    Mean   :10.5345  
##  3rd Qu.:67724   3rd Qu.: 7203582   3rd Qu.:85.22    3rd Qu.:15.0909  
##  Max.   :81744   Max.   :39083067   Max.   :94.38    Max.   :37.6340  
##  ASIAN_PER_CAPITA  FOREIGN_BORN_NON_US_RATIO POVERTY_PERCENT UNEMPLOYMENT_RATE
##  Min.   : 0.7638   Min.   : 0.796            Min.   : 7.90   Min.   :2.840    
##  1st Qu.: 1.5428   1st Qu.: 2.498            1st Qu.:11.13   1st Qu.:4.770    
##  Median : 2.7109   Median : 3.928            Median :13.59   Median :5.910    
##  Mean   : 4.1741   Mean   : 4.786            Mean   :13.54   Mean   :5.716    
##  3rd Qu.: 4.6028   3rd Qu.: 6.780            3rd Qu.:15.41   3rd Qu.:6.660    
##  Max.   :37.8426   Max.   :13.184            Max.   :20.90   Max.   :8.240    
##  SCHOOL_ENROLLMENT_RATE BACHELOR_RATE   ALL_HC_PER_CAPITA
##  Min.   :21.49          Min.   :14.99   Min.   : 0.3255  
##  1st Qu.:24.03          1st Qu.:19.25   1st Qu.: 1.3407  
##  Median :24.95          Median :21.18   Median : 2.4570  
##  Mean   :24.89          Mean   :21.80   Mean   : 6.2577  
##  3rd Qu.:25.52          3rd Qu.:24.25   3rd Qu.: 6.8685  
##  Max.   :31.90          Max.   :31.67   Max.   :51.5159  
##  BLACK_HC_PER_CAPITA_POP ASIAN_HC_PER_CAPITA_POP
##  Min.   : 0.0951         Min.   :0.002662       
##  1st Qu.: 0.4776         1st Qu.:0.024540       
##  Median : 0.9124         Median :0.081968       
##  Mean   : 2.2650         Mean   :0.254118       
##  3rd Qu.: 2.2786         3rd Qu.:0.241591       
##  Max.   :19.8888         Max.   :3.001644
# Number of missing cells in the modelling table; is.na() is already logical,
# so it can be summed directly.
sum(is.na(DF2))
## [1] 0
# Save the modelling table without the row-name column.
write.csv(
  DF2,
  file = "/Users/mandyhong/Desktop/DA401/census_w_hc_FINAL.csv",
  row.names = FALSE
)

1. Descriptive Analysis

1.1. Get time series plots for all three data sets

#1. tsplot
# Each series on its own axes.
tsplot(allHC$hate_crime)

tsplot(blackHC$hate_crime)

tsplot(asianHC$hate_crime)

# Shared palette: all-race, anti-Black, anti-Asian (in that order).
culer <- c(rgb(.85, .30, .12, .6), rgb(.12, .65, .85, .6), "aquamarine3")

# Draw the three series on common axes after applying `f` to each one.
#   f       : transformation applied to every series (identity or log here)
#   y_label : y-axis label
#   top_pad : extra room added above the largest all-race value
# The y-range starts at the smallest transformed anti-Asian value.
overlay_hc <- function(f, y_label, top_pad) {
  all_y <- f(allHC$hate_crime)
  black_y <- f(blackHC$hate_crime)
  asian_y <- f(asianHC$hate_crime)

  tsplot(
    all_y,
    col = culer[1], lwd = 2, pch = 20,
    ylim = c(min(asian_y), max(all_y) + top_pad),
    ylab = y_label, main = "Racial Hate Crimes"
  )
  lines(black_y, col = culer[2], lwd = 2, pch = 20)
  lines(asian_y, col = culer[3], lwd = 2, pch = 20)
  legend(
    "topleft",
    col = culer, lty = 1, lwd = 2, pch = 20,
    legend = c("All race", "African American/Black", "Asian"),
    bg = "white"
  )
  invisible(NULL)
}

# Raw counts, then log counts (with one unit of headroom for the legend).
overlay_hc(identity, "Hate crimes", 0)

overlay_hc(log, "log(Hate crimes)", 1)

#2. time series plot with date in x-axis
# Each series is indexed two ways: by a POSIXct stamp on the first day of the
# month (ts1/ts3/ts5) and by an as.yearmon() index (ts2/ts4/ts6). Only the
# year-month versions are plotted here.
ts1 <- with(
  allHC,
  xts(hate_crime, as.POSIXct(sprintf("%d-%d-01", year, month)))
)
ts2 <- with(allHC, xts(hate_crime, as.yearmon(year + (month - 1) / 12)))
plot(ts2, main = "All Racial Hate Crimes")

ts3 <- with(
  blackHC,
  xts(hate_crime, as.POSIXct(sprintf("%d-%d-01", year, month)))
)
ts4 <- with(blackHC, xts(hate_crime, as.yearmon(year + (month - 1) / 12)))
plot(ts4, main = "Anti-African American or Black Hate Crimes")

ts5 <- with(
  asianHC,
  xts(hate_crime, as.POSIXct(sprintf("%d-%d-01", year, month)))
)
ts6 <- with(asianHC, xts(hate_crime, as.yearmon(year + (month - 1) / 12)))
plot(ts6, main = "Anti-Asian Hate Crimes")

1.2. Summary statistics and distribution

# Column-wise summary of the all-race series (year, month, hate_crime).
summary(allHC)
##       year          month         hate_crime    
##  Min.   :1991   Min.   : 1.00   Min.   : 198.0  
##  1st Qu.:1998   1st Qu.: 3.75   1st Qu.: 319.0  
##  Median :2006   Median : 6.50   Median : 379.0  
##  Mean   :2006   Mean   : 6.50   Mean   : 389.1  
##  3rd Qu.:2013   3rd Qu.: 9.25   3rd Qu.: 448.0  
##  Max.   :2020   Max.   :12.00   Max.   :1329.0
# Standard deviation of the monthly all-race counts.
sd(allHC$hate_crime)
## [1] 112.6868
# Same column-wise summary for the anti-Black series.
summary(blackHC)
##       year          month         hate_crime   
##  Min.   :1991   Min.   : 1.00   Min.   : 88.0  
##  1st Qu.:1998   1st Qu.: 3.75   1st Qu.:167.0  
##  Median :2006   Median : 6.50   Median :202.0  
##  Mean   :2006   Mean   : 6.50   Mean   :207.7  
##  3rd Qu.:2013   3rd Qu.: 9.25   3rd Qu.:242.5  
##  Max.   :2020   Max.   :12.00   Max.   :693.0
# Standard deviation of the monthly anti-Black counts.
sd(blackHC$hate_crime)
## [1] 61.32818
# Same column-wise summary for the anti-Asian series.
summary(asianHC)
##       year          month         hate_crime   
##  Min.   :1991   Min.   : 1.00   Min.   : 3.00  
##  1st Qu.:1998   1st Qu.: 3.75   1st Qu.:12.00  
##  Median :2006   Median : 6.50   Median :17.00  
##  Mean   :2006   Mean   : 6.50   Mean   :17.84  
##  3rd Qu.:2013   3rd Qu.: 9.25   3rd Qu.:23.00  
##  Max.   :2020   Max.   :12.00   Max.   :51.00
sd(asianHC$hate_crime)
## [1] 8.30969
#distribution
# One histogram of the monthly counts per series. Colours come from `culer`
# in the same order as the overlay plots (all-race, anti-Black, anti-Asian).
hist_inputs <- list(allHC$hate_crime, blackHC$hate_crime, asianHC$hate_crime)
hist_titles <- c(
  "Distribution of All Racial Hate Crimes",
  "Distribution of Anti-Black Racial Hate Crimes",
  "Distribution of Anti-Asian Racial Hate Crimes"
)
for (i in seq_along(hist_inputs)) {
  hist(
    hist_inputs[[i]],
    col = culer[i],
    main = hist_titles[i],
    xlab = "Number of Hate Crimes"
  )
}

1.3. Check and describe outliers

# Month-to-month change in the all-race counts. A leading 0 pads the column so
# it has one value per row of allHC; that 0 is a placeholder, not an observed
# change.
allHC$difference <- c(0, diff(allHC$hate_crime))

# Tukey fences: quartile -/+ 1.5 * IQR. The quartiles and the IQR are both
# taken from the true differences (without the padding 0), so they describe
# the same sample and Q[2] - Q[1] equals iqr.
all_diff <- diff(allHC$hate_crime)
iqr <- IQR(all_diff)
Q <- quantile(all_diff, probs = c(.25, .75), na.rm = FALSE)
high <- Q[2] + 1.5 * iqr
low <- Q[1] - 1.5 * iqr

tsplot(
  allHC$difference,
  main = "Detecting Outliers Using IQR Score: All Racial Hate Crimes",
  ylab = "Differenced(hate crime)"
)
# Dashed horizontal lines at the upper and lower fences.
abline(h = high, lty = 2, col = "red")
abline(h = low, lty = 2, col = "red")

# Month-to-month change in the anti-Black counts. A leading 0 pads the column
# so it has one value per row of blackHC; that 0 is a placeholder, not an
# observed change.
blackHC$difference <- c(0, diff(blackHC$hate_crime))

# Tukey fences: quartile -/+ 1.5 * IQR. The quartiles and the IQR are both
# taken from the true differences (without the padding 0), so they describe
# the same sample and Q[2] - Q[1] equals iqr.
black_diff <- diff(blackHC$hate_crime)
iqr <- IQR(black_diff)
Q <- quantile(black_diff, probs = c(.25, .75), na.rm = FALSE)
#Qtest<-quantile(blackHC$difference)
#Qtest[4] #3rd quantile
high <- Q[2] + 1.5 * iqr
low <- Q[1] - 1.5 * iqr

tsplot(
  blackHC$difference,
  main = "Detecting Outliers Using IQR Score: Anti-Black Hate Crimes",
  ylab = "Differenced(hate crime)"
)
# Dashed horizontal lines at the upper and lower fences.
abline(h = high, lty = 2, col = "red")
abline(h = low, lty = 2, col = "red")

#blackHC[blackHC$difference > high, ]
#blackHC[blackHC$difference < low, ]
#outliers=c(which(blackHC$difference > high), which(blackHC$difference < low)) #12 outliers
#blackHC_no_outliers=blackHC[-outliers,]
#tsplot(blackHC_no_outliers$difference) #getting rid of outliers look stationary. In this case can we exclude outliers?
#tsplot(diff(blackHC_no_outliers$hate_crime)) #why do this plot and the plot above looks different?

#check for the outliers
# Month-to-month change in the anti-Asian counts. A leading 0 pads the column
# so it has one value per row of asianHC; that 0 is a placeholder, not an
# observed change.
asianHC$difference <- c(0, diff(asianHC$hate_crime))

# Tukey fences: quartile -/+ 1.5 * IQR. The quartiles and the IQR are both
# taken from the true differences (without the padding 0), so they describe
# the same sample and Q[2] - Q[1] equals iqr.
asian_diff <- diff(asianHC$hate_crime)
iqr <- IQR(asian_diff)
Q <- quantile(asian_diff, probs = c(.25, .75), na.rm = FALSE)
high <- Q[2] + 1.5 * iqr
low <- Q[1] - 1.5 * iqr

tsplot(
  asianHC$difference,
  main = "Detecting Outliers Using IQR Score: Anti-Asian Hate Crimes",
  ylab = "Differenced(hate crime)"
)
# Dashed horizontal lines at the upper and lower fences.
abline(h = high, lty = 2, col = "red")
abline(h = low, lty = 2, col = "red")

#asianHC[asianHC$difference > high, ]
#asianHC[asianHC$difference < low, ]
#outliers=c(which(asianHC$difference > high), which(asianHC$difference < low)) #8 outliers
#asianHC_no_outliers=asianHC[-outliers,]

2. Predictive Analysis: stepwise multiple regression

# Fixed seed so the random split below is reproducible.
set.seed(123)
#0. create train and test data
# Each row is independently labelled 1 (probability 0.8) or 2 (probability
# 0.2), so the split sizes are random rather than exactly 80/20. The model
# output further down reports 39 training rows.
indices = sample(2, nrow(DF2), replace = TRUE, prob=c(0.8,0.2)) #80% train data
train = DF2[indices == 1,]
test = DF2[indices == 2,]
# Scatterplot matrix of every column in the training set.
plot(train)

#1. all racial hate crime
# Full linear model: all-race hate-crime rate on every other column of DF2.
# The "." also brings in the anti-Black and anti-Asian rates as predictors.
# NOTE(review): this is fitted on the full DF2, whereas the anti-Black and
# anti-Asian models in this section are fitted on `train` - confirm intended.
mod_all <- lm(ALL_HC_PER_CAPITA ~ . , data=DF2)
summary(mod_all) # printed output: R-squared 0.9859, adjusted 0.9814
## 
## Call:
## lm(formula = ALL_HC_PER_CAPITA ~ ., data = DF2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.4819 -0.6689 -0.0805  0.4016  4.9962 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               -1.285e+01  1.093e+01  -1.176    0.247    
## MEDIAN_INCOME              3.147e-05  8.202e-05   0.384    0.703    
## POP                       -5.437e-08  4.100e-08  -1.326    0.193    
## WHITE_PER_CAPITA           5.361e-02  6.463e-02   0.829    0.412    
## BLACK_PER_CAPITA           5.489e-02  5.012e-02   1.095    0.280    
## ASIAN_PER_CAPITA           8.057e-02  1.149e-01   0.701    0.487    
## FOREIGN_BORN_NON_US_RATIO  2.258e-02  1.388e-01   0.163    0.872    
## POVERTY_PERCENT            1.861e-01  2.527e-01   0.736    0.466    
## UNEMPLOYMENT_RATE          3.379e-02  3.034e-01   0.111    0.912    
## SCHOOL_ENROLLMENT_RATE     8.111e-02  1.557e-01   0.521    0.605    
## BACHELOR_RATE              8.163e-02  9.918e-02   0.823    0.416    
## BLACK_HC_PER_CAPITA_POP    2.778e+00  1.688e-01  16.459   <2e-16 ***
## ASIAN_HC_PER_CAPITA_POP   -1.433e+00  1.119e+00  -1.281    0.208    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.279 on 37 degrees of freedom
## Multiple R-squared:  0.9859, Adjusted R-squared:  0.9814 
## F-statistic: 216.1 on 12 and 37 DF,  p-value: < 2.2e-16
# Variance inflation factors for the full model. In the printed output
# several exceed 10 (e.g. MEDIAN_INCOME, WHITE_PER_CAPITA, POVERTY_PERCENT).
vif(mod_all)
##             MEDIAN_INCOME                       POP          WHITE_PER_CAPITA 
##                 19.605504                  2.647671                 20.438757 
##          BLACK_PER_CAPITA          ASIAN_PER_CAPITA FOREIGN_BORN_NON_US_RATIO 
##                  6.828188                 12.186095                  4.732608 
##           POVERTY_PERCENT         UNEMPLOYMENT_RATE    SCHOOL_ENROLLMENT_RATE 
##                 16.153629                  3.965255                  1.915081 
##             BACHELOR_RATE   BLACK_HC_PER_CAPITA_POP   ASIAN_HC_PER_CAPITA_POP 
##                  4.598925                 10.937433                 10.043324
# Backward elimination starting from the full model (step() compares
# candidate models by AIC by default); trace=0 silences the per-step log.
mod_all_back=step(mod_all, direction="backward", trace=0)
# 2x2 panel layout so the lm diagnostic plots share one figure.
# NOTE(review): the layout is not reset afterwards.
par(mfrow=c(2,2))
plot(mod_all_back)

summary(mod_all_back)
## 
## Call:
## lm(formula = ALL_HC_PER_CAPITA ~ BLACK_HC_PER_CAPITA_POP, data = DF2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.4573 -0.4753 -0.2354  0.0896  5.4445 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              0.37784    0.20497   1.843   0.0714 .  
## BLACK_HC_PER_CAPITA_POP  2.59601    0.04874  53.262   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.221 on 48 degrees of freedom
## Multiple R-squared:  0.9834, Adjusted R-squared:  0.983 
## F-statistic:  2837 on 1 and 48 DF,  p-value: < 2.2e-16
#vif(mod_all_back)
# (left disabled: the reduced model above kept a single predictor)

#model without hate crime variables
# Same response, but the two other hate-crime rates are removed from the
# predictors so only the census covariates remain.
# NOTE(review): fitted on the full DF2 rather than `train` - confirm intended.
mod_all2 <- lm(ALL_HC_PER_CAPITA ~ . -BLACK_HC_PER_CAPITA_POP -ASIAN_HC_PER_CAPITA_POP, data=DF2)
summary(mod_all2) # printed output: R-squared 0.4624, adjusted 0.3245
## 
## Call:
## lm(formula = ALL_HC_PER_CAPITA ~ . - BLACK_HC_PER_CAPITA_POP - 
##     ASIAN_HC_PER_CAPITA_POP, data = DF2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.497 -4.023 -0.849  1.866 31.907 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)               -1.671e+02  6.050e+01  -2.761  0.00873 **
## MEDIAN_INCOME              9.784e-04  4.693e-04   2.085  0.04370 * 
## POP                        2.214e-07  2.333e-07   0.949  0.34848   
## WHITE_PER_CAPITA           8.272e-01  3.641e-01   2.272  0.02867 * 
## BLACK_PER_CAPITA           4.926e-01  2.925e-01   1.684  0.10014   
## ASIAN_PER_CAPITA           1.128e+00  6.568e-01   1.717  0.09383 . 
## FOREIGN_BORN_NON_US_RATIO  6.640e-01  8.027e-01   0.827  0.41315   
## POVERTY_PERCENT            3.247e+00  1.438e+00   2.258  0.02959 * 
## UNEMPLOYMENT_RATE          1.202e-01  1.802e+00   0.067  0.94718   
## SCHOOL_ENROLLMENT_RATE    -4.540e-01  9.282e-01  -0.489  0.62751   
## BACHELOR_RATE              1.538e-01  5.960e-01   0.258  0.79777   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.701 on 39 degrees of freedom
## Multiple R-squared:  0.4624, Adjusted R-squared:  0.3245 
## F-statistic: 3.354 on 10 and 39 DF,  p-value: 0.003123
# VIFs for the covariates-only model. In the printed output MEDIAN_INCOME,
# WHITE_PER_CAPITA, POVERTY_PERCENT and ASIAN_PER_CAPITA are above 10.
vif(mod_all2)
##             MEDIAN_INCOME                       POP          WHITE_PER_CAPITA 
##                 17.704275                  2.364849                 17.888133 
##          BLACK_PER_CAPITA          ASIAN_PER_CAPITA FOREIGN_BORN_NON_US_RATIO 
##                  6.414012                 10.986843                  4.365748 
##           POVERTY_PERCENT         UNEMPLOYMENT_RATE    SCHOOL_ENROLLMENT_RATE 
##                 14.419097                  3.858263                  1.877929 
##             BACHELOR_RATE 
##                  4.579832
# Backward elimination from the covariates-only model (step() uses AIC by
# default); trace=0 silences the per-step log.
mod_all_back2=step(mod_all2, direction="backward", trace=0)
# 2x2 panel layout for the lm diagnostic plots.
par(mfrow=c(2,2))
plot(mod_all_back2)

summary(mod_all_back2)
## 
## Call:
## lm(formula = ALL_HC_PER_CAPITA ~ MEDIAN_INCOME + WHITE_PER_CAPITA + 
##     BLACK_PER_CAPITA + ASIAN_PER_CAPITA + FOREIGN_BORN_NON_US_RATIO + 
##     POVERTY_PERCENT, data = DF2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -9.117 -4.083 -1.195  1.683 30.704 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               -1.996e+02  5.093e+01  -3.919 0.000314 ***
## MEDIAN_INCOME              1.037e-03  3.369e-04   3.077 0.003633 ** 
## WHITE_PER_CAPITA           1.014e+00  3.012e-01   3.367 0.001610 ** 
## BLACK_PER_CAPITA           6.411e-01  2.412e-01   2.658 0.011000 *  
## ASIAN_PER_CAPITA           1.438e+00  5.553e-01   2.590 0.013043 *  
## FOREIGN_BORN_NON_US_RATIO  1.100e+00  5.407e-01   2.034 0.048106 *  
## POVERTY_PERCENT            3.541e+00  1.103e+00   3.211 0.002503 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.462 on 43 degrees of freedom
## Multiple R-squared:  0.4435, Adjusted R-squared:  0.3658 
## F-statistic:  5.71 on 6 and 43 DF,  p-value: 0.0001961
# VIFs after backward elimination. In the printed output WHITE_PER_CAPITA is
# still above 10 (13.04); the rest are below 10.
vif(mod_all_back2)
##             MEDIAN_INCOME          WHITE_PER_CAPITA          BLACK_PER_CAPITA 
##                  9.714941                 13.039657                  4.646942 
##          ASIAN_PER_CAPITA FOREIGN_BORN_NON_US_RATIO           POVERTY_PERCENT 
##                  8.362728                  2.109388                  9.032896
#2. anti-black hate crime
# Full linear model on the training rows: anti-Black hate-crime rate on every
# other column. The "." also brings in the all-race and anti-Asian rates.
mod_black <- lm(BLACK_HC_PER_CAPITA_POP ~ . , data=train)
summary(mod_black)
## 
## Call:
## lm(formula = BLACK_HC_PER_CAPITA_POP ~ ., data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.46460 -0.15708  0.00272  0.19000  0.85507 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                3.645e+00  4.422e+00   0.824   0.4173    
## MEDIAN_INCOME              3.169e-06  3.485e-05   0.091   0.9282    
## POP                        6.587e-09  1.908e-08   0.345   0.7327    
## WHITE_PER_CAPITA           3.184e-03  2.491e-02   0.128   0.8993    
## BLACK_PER_CAPITA           7.076e-03  2.255e-02   0.314   0.7562    
## ASIAN_PER_CAPITA          -6.421e-02  9.880e-02  -0.650   0.5215    
## FOREIGN_BORN_NON_US_RATIO  3.076e-02  6.323e-02   0.487   0.6306    
## POVERTY_PERCENT           -1.086e-01  1.102e-01  -0.985   0.3338    
## UNEMPLOYMENT_RATE          2.343e-02  1.152e-01   0.203   0.8404    
## SCHOOL_ENROLLMENT_RATE    -6.872e-02  6.388e-02  -1.076   0.2919    
## BACHELOR_RATE             -5.457e-02  4.405e-02  -1.239   0.2265    
## ALL_HC_PER_CAPITA          3.287e-01  2.184e-02  15.050 2.38e-14 ***
## ASIAN_HC_PER_CAPITA_POP    1.007e+00  3.835e-01   2.624   0.0143 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4413 on 26 degrees of freedom
## Multiple R-squared:  0.9886, Adjusted R-squared:  0.9834 
## F-statistic: 188.3 on 12 and 26 DF,  p-value: < 2.2e-16
# VIFs for the full anti-Black model. In the printed output MEDIAN_INCOME,
# POVERTY_PERCENT and WHITE_PER_CAPITA are above 10.
vif(mod_black)
##             MEDIAN_INCOME                       POP          WHITE_PER_CAPITA 
##                 17.682850                  2.216319                 11.184198 
##          BLACK_PER_CAPITA          ASIAN_PER_CAPITA FOREIGN_BORN_NON_US_RATIO 
##                  7.437974                  8.553221                  5.231084 
##           POVERTY_PERCENT         UNEMPLOYMENT_RATE    SCHOOL_ENROLLMENT_RATE 
##                 15.839536                  3.760373                  2.392602 
##             BACHELOR_RATE         ALL_HC_PER_CAPITA   ASIAN_HC_PER_CAPITA_POP 
##                  4.918854                  7.777060                  6.736593
# Backward elimination from the full anti-Black model (step() uses AIC by
# default); trace=0 silences the per-step log.
mod_black_back=step(mod_black, direction="backward", trace=0)
# 2x2 panel layout for the lm diagnostic plots.
par(mfrow=c(2,2))
plot(mod_black_back)

summary(mod_black_back)
## 
## Call:
## lm(formula = BLACK_HC_PER_CAPITA_POP ~ ALL_HC_PER_CAPITA + ASIAN_HC_PER_CAPITA_POP, 
##     data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.52642 -0.08406  0.02117  0.10565  0.97951 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             -0.04600    0.07958  -0.578  0.56686    
## ALL_HC_PER_CAPITA        0.31949    0.01660  19.241  < 2e-16 ***
## ASIAN_HC_PER_CAPITA_POP  1.07965    0.31329   3.446  0.00146 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4014 on 36 degrees of freedom
## Multiple R-squared:  0.987,  Adjusted R-squared:  0.9862 
## F-statistic:  1364 on 2 and 36 DF,  p-value: < 2.2e-16
# VIFs for the reduced model. With only two predictors left, both get the
# same value (5.43 in the printed output).
vif(mod_black_back)
##       ALL_HC_PER_CAPITA ASIAN_HC_PER_CAPITA_POP 
##                 5.43352                 5.43352
#model without hate crime variables
# Same response on the training rows, with the all-race and anti-Asian rates
# removed so only the census covariates remain as predictors.
mod_black2 <- lm(BLACK_HC_PER_CAPITA_POP ~ . -ALL_HC_PER_CAPITA-ASIAN_HC_PER_CAPITA_POP, data=train)
summary(mod_black2)
## 
## Call:
## lm(formula = BLACK_HC_PER_CAPITA_POP ~ . - ALL_HC_PER_CAPITA - 
##     ASIAN_HC_PER_CAPITA_POP, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.6105 -1.7486 -0.2017  0.8917 10.3159 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)               -6.435e+01  2.551e+01  -2.523   0.0176 *
## MEDIAN_INCOME              3.095e-04  2.224e-04   1.391   0.1750  
## POP                        7.469e-08  1.229e-07   0.608   0.5483  
## WHITE_PER_CAPITA           3.062e-01  1.515e-01   2.021   0.0529 .
## BLACK_PER_CAPITA           1.942e-01  1.445e-01   1.344   0.1897  
## ASIAN_PER_CAPITA           1.416e+00  5.645e-01   2.508   0.0182 *
## FOREIGN_BORN_NON_US_RATIO -1.962e-01  3.940e-01  -0.498   0.6224  
## POVERTY_PERCENT            1.355e+00  6.672e-01   2.031   0.0518 .
## UNEMPLOYMENT_RATE         -3.618e-01  7.536e-01  -0.480   0.6348  
## SCHOOL_ENROLLMENT_RATE    -1.152e-02  4.136e-01  -0.028   0.9780  
## BACHELOR_RATE              9.194e-02  2.876e-01   0.320   0.7516  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.915 on 28 degrees of freedom
## Multiple R-squared:  0.4654, Adjusted R-squared:  0.2745 
## F-statistic: 2.438 on 10 and 28 DF,  p-value: 0.0309
# VIFs for the covariates-only anti-Black model. In the printed output
# MEDIAN_INCOME and POVERTY_PERCENT are above 10.
vif(mod_black2)
##             MEDIAN_INCOME                       POP          WHITE_PER_CAPITA 
##                 16.501417                  2.107509                  9.480899 
##          BLACK_PER_CAPITA          ASIAN_PER_CAPITA FOREIGN_BORN_NON_US_RATIO 
##                  6.996064                  6.399358                  4.655469 
##           POVERTY_PERCENT         UNEMPLOYMENT_RATE    SCHOOL_ENROLLMENT_RATE 
##                 13.298666                  3.688606                  2.298976 
##             BACHELOR_RATE 
##                  4.804371
# Backward elimination from the covariates-only anti-Black model (step() uses
# AIC by default); trace=0 silences the per-step log.
mod_black_back2=step(mod_black2, direction="backward", trace=0)
# 2x2 panel layout for the lm diagnostic plots.
par(mfrow=c(2,2))
plot(mod_black_back2)

summary(mod_black_back2)
## 
## Call:
## lm(formula = BLACK_HC_PER_CAPITA_POP ~ MEDIAN_INCOME + WHITE_PER_CAPITA + 
##     BLACK_PER_CAPITA + ASIAN_PER_CAPITA + POVERTY_PERCENT, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4842 -1.4136 -0.3367  0.9879 10.2929 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)   
## (Intercept)      -6.622e+01  2.181e+01  -3.036  0.00466 **
## MEDIAN_INCOME     3.000e-04  1.615e-04   1.858  0.07217 . 
## WHITE_PER_CAPITA  3.595e-01  1.209e-01   2.973  0.00547 **
## BLACK_PER_CAPITA  2.502e-01  1.055e-01   2.372  0.02370 * 
## ASIAN_PER_CAPITA  1.326e+00  4.040e-01   3.283  0.00243 **
## POVERTY_PERCENT   1.147e+00  5.038e-01   2.277  0.02937 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.723 on 33 degrees of freedom
## Multiple R-squared:  0.4504, Adjusted R-squared:  0.3672 
## F-statistic:  5.41 on 5 and 33 DF,  p-value: 0.0009665
# VIFs after backward elimination. All are below 10 in the printed output
# (largest: MEDIAN_INCOME at 9.98).
vif(mod_black_back2)
##    MEDIAN_INCOME WHITE_PER_CAPITA BLACK_PER_CAPITA ASIAN_PER_CAPITA 
##         9.977125         6.922026         4.275537         3.757292 
##  POVERTY_PERCENT 
##         8.691638
#3. anti-asian hate crime
# Full linear model on the training rows: anti-Asian hate-crime rate on every
# other column. The "." also brings in the all-race and anti-Black rates.
mod_asian <- lm(ASIAN_HC_PER_CAPITA_POP ~ ., data=train)
summary(mod_asian)
## 
## Call:
## lm(formula = ASIAN_HC_PER_CAPITA_POP ~ ., data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.64048 -0.05295  0.00483  0.07283  0.46802 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)                7.507e-01  2.031e+00   0.370   0.7147  
## MEDIAN_INCOME              4.104e-06  1.583e-05   0.259   0.7974  
## POP                       -8.752e-09  8.524e-09  -1.027   0.3140  
## WHITE_PER_CAPITA          -7.944e-03  1.122e-02  -0.708   0.4853  
## BLACK_PER_CAPITA          -2.615e-03  1.026e-02  -0.255   0.8008  
## ASIAN_PER_CAPITA          -3.141e-02  4.486e-02  -0.700   0.4901  
## FOREIGN_BORN_NON_US_RATIO  3.351e-02  2.812e-02   1.192   0.2441  
## POVERTY_PERCENT            3.448e-03  5.104e-02   0.068   0.9467  
## UNEMPLOYMENT_RATE         -2.997e-02  5.207e-02  -0.575   0.5699  
## SCHOOL_ENROLLMENT_RATE    -1.119e-02  2.960e-02  -0.378   0.7084  
## BACHELOR_RATE              1.294e-03  2.061e-02   0.063   0.9504  
## ALL_HC_PER_CAPITA         -2.946e-02  3.040e-02  -0.969   0.3414  
## BLACK_HC_PER_CAPITA_POP    2.081e-01  7.928e-02   2.624   0.0143 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2006 on 26 degrees of freedom
## Multiple R-squared:  0.8826, Adjusted R-squared:  0.8285 
## F-statistic:  16.3 on 12 and 26 DF,  p-value: 3.856e-09
# VIFs for the full anti-Asian model. In the printed output ALL_HC_PER_CAPITA
# (72.9) and BLACK_HC_PER_CAPITA_POP (69.5) are far above 10, so these two
# predictors are close to collinear.
vif(mod_asian)
##             MEDIAN_INCOME                       POP          WHITE_PER_CAPITA 
##                 17.642846                  2.139721                 10.979608 
##          BLACK_PER_CAPITA          ASIAN_PER_CAPITA FOREIGN_BORN_NON_US_RATIO 
##                  7.447525                  8.531338                  5.005225 
##           POVERTY_PERCENT         UNEMPLOYMENT_RATE    SCHOOL_ENROLLMENT_RATE 
##                 16.427579                  3.718994                  2.485430 
##             BACHELOR_RATE         ALL_HC_PER_CAPITA   BLACK_HC_PER_CAPITA_POP 
##                  5.208384                 72.894602                 69.498326
# Backward elimination from the full anti-Asian model (step() uses AIC by
# default); trace=0 silences the per-step log.
mod_asian_back=step(mod_asian, direction="backward", trace=0)
# 2x2 panel layout for the lm diagnostic plots.
par(mfrow=c(2,2))
plot(mod_asian_back)

summary(mod_asian_back)
## 
## Call:
## lm(formula = ASIAN_HC_PER_CAPITA_POP ~ POP + FOREIGN_BORN_NON_US_RATIO + 
##     ALL_HC_PER_CAPITA + BLACK_HC_PER_CAPITA_POP, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.66987 -0.05725  0.02294  0.06072  0.44670 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)               -1.000e-01  5.848e-02  -1.711  0.09624 . 
## POP                       -8.618e-09  6.357e-09  -1.356  0.18413   
## FOREIGN_BORN_NON_US_RATIO  2.804e-02  1.413e-02   1.984  0.05542 . 
## ALL_HC_PER_CAPITA         -3.377e-02  2.440e-02  -1.384  0.17531   
## BLACK_HC_PER_CAPITA_POP    2.161e-01  6.517e-02   3.316  0.00218 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.18 on 34 degrees of freedom
## Multiple R-squared:  0.8765, Adjusted R-squared:  0.862 
## F-statistic: 60.35 on 4 and 34 DF,  p-value: 5.722e-15
# VIFs for the reduced model. ALL_HC_PER_CAPITA and BLACK_HC_PER_CAPITA_POP
# both survive the elimination and each has a VIF near 58 in the printed
# output, so the collinearity between them remains.
vif(mod_asian_back)
##                       POP FOREIGN_BORN_NON_US_RATIO         ALL_HC_PER_CAPITA 
##                  1.479044                  1.571992                 58.347647 
##   BLACK_HC_PER_CAPITA_POP 
##                 58.369345
#model without hate crime variables
# Same response on the training rows, with the all-race and anti-Black rates
# removed so only the census covariates remain as predictors.
mod_asian2 <- lm(ASIAN_HC_PER_CAPITA_POP ~ . -ALL_HC_PER_CAPITA-BLACK_HC_PER_CAPITA_POP, data=train)
summary(mod_asian2)
## 
## Call:
## lm(formula = ASIAN_HC_PER_CAPITA_POP ~ . - ALL_HC_PER_CAPITA - 
##     BLACK_HC_PER_CAPITA_POP, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.49454 -0.23939 -0.00395  0.09092  1.71530 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)               -7.193e+00  3.788e+00  -1.899   0.0679 .
## MEDIAN_INCOME              4.511e-05  3.303e-05   1.366   0.1829  
## POP                        7.520e-10  1.825e-08   0.041   0.9674  
## WHITE_PER_CAPITA           3.145e-02  2.250e-02   1.398   0.1732  
## BLACK_PER_CAPITA           2.311e-02  2.146e-02   1.077   0.2908  
## ASIAN_PER_CAPITA           1.434e-01  8.384e-02   1.711   0.0982 .
## FOREIGN_BORN_NON_US_RATIO  1.433e-02  5.851e-02   0.245   0.8083  
## POVERTY_PERCENT            1.695e-01  9.909e-02   1.711   0.0981 .
## UNEMPLOYMENT_RATE         -7.773e-02  1.119e-01  -0.695   0.4930  
## SCHOOL_ENROLLMENT_RATE    -2.057e-02  6.143e-02  -0.335   0.7402  
## BACHELOR_RATE              8.014e-03  4.271e-02   0.188   0.8525  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4329 on 28 degrees of freedom
## Multiple R-squared:  0.4116, Adjusted R-squared:  0.2014 
## F-statistic: 1.959 on 10 and 28 DF,  p-value: 0.07867
# VIFs for the covariates-only anti-Asian model. The printed values match
# those of mod_black2: both models use the same predictors and the same rows.
vif(mod_asian2)
##             MEDIAN_INCOME                       POP          WHITE_PER_CAPITA 
##                 16.501417                  2.107509                  9.480899 
##          BLACK_PER_CAPITA          ASIAN_PER_CAPITA FOREIGN_BORN_NON_US_RATIO 
##                  6.996064                  6.399358                  4.655469 
##           POVERTY_PERCENT         UNEMPLOYMENT_RATE    SCHOOL_ENROLLMENT_RATE 
##                 13.298666                  3.688606                  2.298976 
##             BACHELOR_RATE 
##                  4.804371
# Backward elimination from the covariates-only anti-Asian model (step() uses
# AIC by default); trace=0 silences the per-step log.
mod_asian_back2=step(mod_asian2, direction="backward", trace=0)
# 2x2 panel layout for the lm diagnostic plots.
par(mfrow=c(2,2))
plot(mod_asian_back2)

summary(mod_asian_back2)
## 
## Call:
## lm(formula = ASIAN_HC_PER_CAPITA_POP ~ MEDIAN_INCOME + WHITE_PER_CAPITA + 
##     BLACK_PER_CAPITA + ASIAN_PER_CAPITA + POVERTY_PERCENT, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.46558 -0.25852 -0.00099  0.16308  1.74991 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)  
## (Intercept)      -7.555e+00  3.236e+00  -2.334   0.0258 *
## MEDIAN_INCOME     3.961e-05  2.396e-05   1.653   0.1078  
## WHITE_PER_CAPITA  3.676e-02  1.794e-02   2.049   0.0485 *
## BLACK_PER_CAPITA  2.598e-02  1.565e-02   1.660   0.1064  
## ASIAN_PER_CAPITA  1.556e-01  5.994e-02   2.596   0.0140 *
## POVERTY_PERCENT   1.319e-01  7.474e-02   1.765   0.0869 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4039 on 33 degrees of freedom
## Multiple R-squared:  0.3963, Adjusted R-squared:  0.3048 
## F-statistic: 4.332 on 5 and 33 DF,  p-value: 0.003856
# VIFs after backward elimination. All are below 10 in the printed output
# (largest: MEDIAN_INCOME at 9.98).
vif(mod_asian_back2)
##    MEDIAN_INCOME WHITE_PER_CAPITA BLACK_PER_CAPITA ASIAN_PER_CAPITA 
##         9.977125         6.922026         4.275537         3.757292 
##  POVERTY_PERCENT 
##         8.691638

3. Predictive Analysis: k nearest neighbors

set.seed(123)
#1. all racial hate crime
#Let caret search for best k
n_folds <- 10
trctrl <- trainControl(method = "cv", number = n_folds) #10-fold cv

# Candidate k values: odd numbers from 5 up to just below the approximate
# number of rows each CV training split contains. Asking for more neighbours
# than there are rows is uninformative (in the earlier results every k >= 37
# gave identical errors).
max_k <- max(5, floor(nrow(train) * (1 - 1 / n_folds)) - 1)
k_grid <- data.frame(k = seq(5, max_k, by = 2))

# kNN works on distances, so predictors are centred and scaled first;
# otherwise POP (in the millions) would swamp the percentage-scale columns.
# caret::train is spelled out because the data frame is also called `train`.
# NOTE(review): "." still includes the anti-Black and anti-Asian hate-crime
# rates as predictors - confirm that is intended.
# The tuning table printed after this chunk must be regenerated after this
# change.
simple_fit3 <- caret::train(
  ALL_HC_PER_CAPITA ~ .,
  data = train,
  method = "knn",
  preProcess = c("center", "scale"),
  trControl = trctrl,
  tuneGrid = k_grid
)
plot(simple_fit3)

simple_fit3
## k-Nearest Neighbors 
## 
## 39 samples
## 12 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 35, 35, 35, 35, 35, 35, ... 
## Resampling results across tuning parameters:
## 
##   k    RMSE      Rsquared   MAE     
##     5  6.322412  0.4237717  4.551636
##     7  6.420670  0.4056259  4.601590
##     9  6.607362  0.3613087  4.776129
##    11  6.307716  0.3844472  4.477374
##    13  6.180925  0.4416995  4.267641
##    15  6.152784  0.4458449  4.209575
##    17  6.156900  0.4387926  4.297701
##    19  6.010273  0.4613615  4.230657
##    21  5.953789  0.4649601  4.242796
##    23  5.963167  0.4642294  4.232403
##    25  6.044545  0.4504058  4.313037
##    27  6.479173  0.4084240  4.832036
##    29  6.933877  0.4108113  5.363944
##    31  7.064728  0.4432337  5.492342
##    33  7.050698  0.3983654  5.456585
##    35  7.065644  0.4098173  5.471668
##    37  7.062176  0.4098173  5.475572
##    39  7.062176  0.4098173  5.475572
##    41  7.062176  0.4098173  5.475572
##    43  7.062176  0.4098173  5.475572
##    45  7.062176  0.4098173  5.475572
##    47  7.062176  0.4098173  5.475572
##    49  7.062176  0.4098173  5.475572
##    51  7.062176  0.4098173  5.475572
##    53  7.062176  0.4098173  5.475572
##    55  7.062176  0.4098173  5.475572
##    57  7.062176  0.4098173  5.475572
##    59  7.062176  0.4098173  5.475572
##    61  7.062176  0.4098173  5.475572
##    63  7.062176  0.4098173  5.475572
##    65  7.062176  0.4098173  5.475572
##    67  7.062176  0.4098173  5.475572
##    69  7.062176  0.4098173  5.475572
##    71  7.062176  0.4098173  5.475572
##    73  7.062176  0.4098173  5.475572
##    75  7.062176  0.4098173  5.475572
##    77  7.062176  0.4098173  5.475572
##    79  7.062176  0.4098173  5.475572
##    81  7.062176  0.4098173  5.475572
##    83  7.062176  0.4098173  5.475572
##    85  7.062176  0.4098173  5.475572
##    87  7.062176  0.4098173  5.475572
##    89  7.062176  0.4098173  5.475572
##    91  7.062176  0.4098173  5.475572
##    93  7.062176  0.4098173  5.475572
##    95  7.062176  0.4098173  5.475572
##    97  7.062176  0.4098173  5.475572
##    99  7.062176  0.4098173  5.475572
##   101  7.062176  0.4098173  5.475572
##   103  7.062176  0.4098173  5.475572
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 21.
# 2. Anti-Black hate crime
# Same kNN search as above: 50 candidate k values, resampled with trctrl.
# NOTE(review): the printed fit reports "No pre-processing" (predictors are not
# centered/scaled), and `~ .` uses every other column of train as a predictor --
# confirm no other outcome column is among them.
simple_fit4 <- train(
  BLACK_HC_PER_CAPITA_POP ~ .,
  data = train,
  method = "knn",
  tuneLength = 50,
  trControl = trctrl
)
plot(simple_fit4)

simple_fit4
## k-Nearest Neighbors 
## 
## 39 samples
## 12 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 35, 35, 35, 35, 35, 35, ... 
## Resampling results across tuning parameters:
## 
##   k    RMSE      Rsquared   MAE     
##     5  2.644446  0.4748750  1.795815
##     7  2.422895  0.5830589  1.732380
##     9  2.360427  0.6349595  1.722547
##    11  2.253279  0.6551436  1.621244
##    13  2.166296  0.6180502  1.509124
##    15  2.173714  0.5877432  1.528599
##    17  2.151263  0.5910766  1.499530
##    19  2.156769  0.5669521  1.531792
##    21  2.127495  0.5865246  1.534167
##    23  2.136914  0.5696179  1.535365
##    25  2.126744  0.5911237  1.533855
##    27  2.288571  0.5784928  1.675899
##    29  2.458393  0.5898117  1.866354
##    31  2.501090  0.6271986  1.883699
##    33  2.494183  0.5065472  1.875172
##    35  2.503459  0.3695950  1.880769
##    37  2.502700  0.3695950  1.882446
##    39  2.502700  0.3695950  1.882446
##    41  2.502700  0.3695950  1.882446
##    43  2.502700  0.3695950  1.882446
##    45  2.502700  0.3695950  1.882446
##    47  2.502700  0.3695950  1.882446
##    49  2.502700  0.3695950  1.882446
##    51  2.502700  0.3695950  1.882446
##    53  2.502700  0.3695950  1.882446
##    55  2.502700  0.3695950  1.882446
##    57  2.502700  0.3695950  1.882446
##    59  2.502700  0.3695950  1.882446
##    61  2.502700  0.3695950  1.882446
##    63  2.502700  0.3695950  1.882446
##    65  2.502700  0.3695950  1.882446
##    67  2.502700  0.3695950  1.882446
##    69  2.502700  0.3695950  1.882446
##    71  2.502700  0.3695950  1.882446
##    73  2.502700  0.3695950  1.882446
##    75  2.502700  0.3695950  1.882446
##    77  2.502700  0.3695950  1.882446
##    79  2.502700  0.3695950  1.882446
##    81  2.502700  0.3695950  1.882446
##    83  2.502700  0.3695950  1.882446
##    85  2.502700  0.3695950  1.882446
##    87  2.502700  0.3695950  1.882446
##    89  2.502700  0.3695950  1.882446
##    91  2.502700  0.3695950  1.882446
##    93  2.502700  0.3695950  1.882446
##    95  2.502700  0.3695950  1.882446
##    97  2.502700  0.3695950  1.882446
##    99  2.502700  0.3695950  1.882446
##   101  2.502700  0.3695950  1.882446
##   103  2.502700  0.3695950  1.882446
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 25.
# 3. Anti-Asian hate crime
# Same kNN search as above: 50 candidate k values, resampled with trctrl.
# NOTE(review): the printed fit reports "No pre-processing" (predictors are not
# centered/scaled), and `~ .` uses every other column of train as a predictor --
# confirm no other outcome column is among them.
simple_fit5 <- train(
  ASIAN_HC_PER_CAPITA_POP ~ .,
  data = train,
  method = "knn",
  tuneLength = 50,
  trControl = trctrl
)
plot(simple_fit5)

simple_fit5
## k-Nearest Neighbors 
## 
## 39 samples
## 12 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 35, 35, 35, 35, 35, 35, ... 
## Resampling results across tuning parameters:
## 
##   k    RMSE       Rsquared   MAE      
##     5  0.3691158  0.5747832  0.2394554
##     7  0.3289058  0.6438099  0.2202018
##     9  0.3028478  0.6967459  0.2024725
##    11  0.2880529  0.6554908  0.2007762
##    13  0.2743234  0.6403317  0.1896897
##    15  0.2696925  0.5931313  0.1883349
##    17  0.2627104  0.5862949  0.1849407
##    19  0.2612330  0.5943251  0.1800576
##    21  0.2542633  0.5985920  0.1741475
##    23  0.2535920  0.5562669  0.1729369
##    25  0.2528757  0.5025716  0.1751082
##    27  0.2786101  0.5570465  0.2011020
##    29  0.2968264  0.5488665  0.2182805
##    31  0.3017404  0.5206127  0.2234350
##    33  0.3006905  0.4938958  0.2219397
##    35  0.3015118  0.4560191  0.2232957
##    37  0.3016995  0.4560191  0.2234949
##    39  0.3016995  0.4560191  0.2234949
##    41  0.3016995  0.4560191  0.2234949
##    43  0.3016995  0.4560191  0.2234949
##    45  0.3016995  0.4560191  0.2234949
##    47  0.3016995  0.4560191  0.2234949
##    49  0.3016995  0.4560191  0.2234949
##    51  0.3016995  0.4560191  0.2234949
##    53  0.3016995  0.4560191  0.2234949
##    55  0.3016995  0.4560191  0.2234949
##    57  0.3016995  0.4560191  0.2234949
##    59  0.3016995  0.4560191  0.2234949
##    61  0.3016995  0.4560191  0.2234949
##    63  0.3016995  0.4560191  0.2234949
##    65  0.3016995  0.4560191  0.2234949
##    67  0.3016995  0.4560191  0.2234949
##    69  0.3016995  0.4560191  0.2234949
##    71  0.3016995  0.4560191  0.2234949
##    73  0.3016995  0.4560191  0.2234949
##    75  0.3016995  0.4560191  0.2234949
##    77  0.3016995  0.4560191  0.2234949
##    79  0.3016995  0.4560191  0.2234949
##    81  0.3016995  0.4560191  0.2234949
##    83  0.3016995  0.4560191  0.2234949
##    85  0.3016995  0.4560191  0.2234949
##    87  0.3016995  0.4560191  0.2234949
##    89  0.3016995  0.4560191  0.2234949
##    91  0.3016995  0.4560191  0.2234949
##    93  0.3016995  0.4560191  0.2234949
##    95  0.3016995  0.4560191  0.2234949
##    97  0.3016995  0.4560191  0.2234949
##    99  0.3016995  0.4560191  0.2234949
##   101  0.3016995  0.4560191  0.2234949
##   103  0.3016995  0.4560191  0.2234949
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 25.

4. Model validation using held-out test data

4.1. Calculate RMSE and normalized RMSE

set.seed(123)
# 1. All racial hate crime: stepwise multiple regression with 12 variables
# Predict the test rows and keep the predictions in a one-column data frame.
test_pred4 <- predict(mod_all_back, newdata = test)
test_pred_df4 <- as.data.frame(test_pred4)
# RMSE: square root of the mean squared difference between observed and
# predicted values.
RMSE_4 <- sqrt(mean((test$ALL_HC_PER_CAPITA - test_pred_df4$test_pred4)^2))
# Normalized RMSE: RMSE divided by the observed range (max - min) of the
# response in the test set.
N_RMSE_4 <- RMSE_4 / diff(range(test$ALL_HC_PER_CAPITA))
round(RMSE_4, digits = 3)
## [1] 1.156
round(N_RMSE_4, digits = 3)
## [1] 0.037
# 2. All racial hate crime: stepwise multiple regression with 10 variables
# Predict the test rows and keep the predictions in a one-column data frame.
test_pred4.2 <- predict(mod_all_back2, newdata = test)
test_pred_df4.2 <- as.data.frame(test_pred4.2)
# RMSE: square root of the mean squared difference between observed and
# predicted values.
RMSE_4.2 <- sqrt(
  mean((test$ALL_HC_PER_CAPITA - test_pred_df4.2$test_pred4.2)^2)
)
# Normalized RMSE: RMSE divided by the observed range of the response.
N_RMSE_4.2 <- RMSE_4.2 / diff(range(test$ALL_HC_PER_CAPITA))
round(RMSE_4.2, digits = 3)
## [1] 5.95
round(N_RMSE_4.2, digits = 3)
## [1] 0.191
# 3. Anti-Black hate crime: stepwise multiple regression with 12 variables
# Predict the test rows and keep the predictions in a one-column data frame.
test_pred5 <- predict(mod_black_back, newdata = test)
test_pred_df5 <- as.data.frame(test_pred5)
# RMSE: square root of the mean squared difference between observed and
# predicted values.
RMSE_5 <- sqrt(
  mean((test$BLACK_HC_PER_CAPITA_POP - test_pred_df5$test_pred5)^2)
)
# Normalized RMSE: RMSE divided by the observed range of the response.
N_RMSE_5 <- RMSE_5 / diff(range(test$BLACK_HC_PER_CAPITA_POP))
round(RMSE_5, digits = 3)
## [1] 0.49
round(N_RMSE_5, digits = 3)
## [1] 0.04
# 4. Anti-Black hate crime: stepwise multiple regression with 10 variables
# Predict the test rows and keep the predictions in a one-column data frame
# whose column is named after the vector, i.e. `test_pred5.2`.
test_pred5.2 <- predict(mod_black_back2, newdata = test)
test_pred_df5.2 <- as.data.frame(test_pred5.2)
# Refer to the prediction column by its exact name. The previous
# `$test_pred5` only resolved to `test_pred5.2` through `$` partial matching,
# which would silently pick up a different column (or NULL) if the data frame
# ever gained another `test_pred5*` column.
RMSE_5.2 <- sqrt(
  mean((test$BLACK_HC_PER_CAPITA_POP - test_pred_df5.2$test_pred5.2)^2)
)
# Normalized RMSE: RMSE divided by the observed range (max - min) of the
# response in the test set.
N_RMSE_5.2 <- RMSE_5.2 /
  (max(test$BLACK_HC_PER_CAPITA_POP) - min(test$BLACK_HC_PER_CAPITA_POP))
round(RMSE_5.2, digits = 3)
## [1] 8.784
round(N_RMSE_5.2, digits = 3)
## [1] 0.715
# 5. Anti-Asian hate crime: stepwise multiple regression with 12 variables
# Predict the test rows and keep the predictions in a one-column data frame.
test_pred6 <- predict(mod_asian_back, newdata = test)
test_pred_df6 <- as.data.frame(test_pred6)
# RMSE: square root of the mean squared difference between observed and
# predicted values.
RMSE_6 <- sqrt(
  mean((test$ASIAN_HC_PER_CAPITA_POP - test_pred_df6$test_pred6)^2)
)
# Normalized RMSE: RMSE divided by the observed range of the response.
N_RMSE_6 <- RMSE_6 / diff(range(test$ASIAN_HC_PER_CAPITA_POP))
round(RMSE_6, digits = 3)
## [1] 0.164
round(N_RMSE_6, digits = 3)
## [1] 0.082
# 6. Anti-Asian hate crime: stepwise multiple regression with 10 variables
# Predict the test rows and keep the predictions in a one-column data frame
# whose column is named after the vector, i.e. `test_pred6.2`.
test_pred6.2 <- predict(mod_asian_back2, newdata = test)
test_pred_df6.2 <- as.data.frame(test_pred6.2)
# Refer to the prediction column by its exact name. The previous
# `$test_pred6` only resolved to `test_pred6.2` through `$` partial matching,
# which would silently pick up a different column (or NULL) if the data frame
# ever gained another `test_pred6*` column.
RMSE_6.2 <- sqrt(
  mean((test$ASIAN_HC_PER_CAPITA_POP - test_pred_df6.2$test_pred6.2)^2)
)
# Normalized RMSE: RMSE divided by the observed range (max - min) of the
# response in the test set.
N_RMSE_6.2 <- RMSE_6.2 /
  (max(test$ASIAN_HC_PER_CAPITA_POP) - min(test$ASIAN_HC_PER_CAPITA_POP))
round(RMSE_6.2, digits = 3)
## [1] 1.203
round(N_RMSE_6.2, digits = 3)
## [1] 0.599
# 7. All racial hate crime: knn with 12 variables
# Predict the test rows with the tuned kNN fit and keep the predictions in a
# one-column data frame.
test_pred <- predict(simple_fit3, newdata = test)
test_pred_df <- as.data.frame(test_pred)
# RMSE: square root of the mean squared difference between observed and
# predicted values.
RMSE_1 <- sqrt(mean((test$ALL_HC_PER_CAPITA - test_pred_df$test_pred)^2))
# Normalized RMSE: RMSE divided by the observed range of the response.
N_RMSE_1 <- RMSE_1 / diff(range(test$ALL_HC_PER_CAPITA))
round(RMSE_1, digits = 3)
## [1] 8.099
round(N_RMSE_1, digits = 3)
## [1] 0.261
# 8. Anti-Black hate crime: knn with 12 variables
# Predict the test rows with the tuned kNN fit and keep the predictions in a
# one-column data frame.
test_pred2 <- predict(simple_fit4, newdata = test)
test_pred_df2 <- as.data.frame(test_pred2)
# RMSE: square root of the mean squared difference between observed and
# predicted values.
RMSE_2 <- sqrt(
  mean((test$BLACK_HC_PER_CAPITA_POP - test_pred_df2$test_pred2)^2)
)
# Normalized RMSE: RMSE divided by the observed range of the response.
N_RMSE_2 <- RMSE_2 / diff(range(test$BLACK_HC_PER_CAPITA_POP))
round(RMSE_2, digits = 3)
## [1] 3.539
round(N_RMSE_2, digits = 3)
## [1] 0.288
# 9. Anti-Asian hate crime: knn with 12 variables
# Predict the test rows with the tuned kNN fit and keep the predictions in a
# one-column data frame.
test_pred3 <- predict(simple_fit5, newdata = test)
test_pred_df3 <- as.data.frame(test_pred3)
# RMSE: square root of the mean squared difference between observed and
# predicted values.
RMSE_3 <- sqrt(
  mean((test$ASIAN_HC_PER_CAPITA_POP - test_pred_df3$test_pred3)^2)
)
# Normalized RMSE: RMSE divided by the observed range of the response.
N_RMSE_3 <- RMSE_3 / diff(range(test$ASIAN_HC_PER_CAPITA_POP))
round(RMSE_3, digits = 3)
## [1] 0.56
round(N_RMSE_3, digits = 3)
## [1] 0.279

4.2. Plot predicted (using knn) vs. actual value

# Grouped bar charts comparing the observed test-set values with the kNN
# predictions (test_pred_df, test_pred_df2, test_pred_df3), one chart per
# outcome.

# Numerical state index used on the x axis, one entry per row of `test`.
# NOTE(review): hard-coded -- presumably the row indices that were sampled into
# `test`; confirm (e.g. against rownames(test)) if the split ever changes.
test_states <- as.factor(c(4, 5, 8, 11, 16, 20, 21, 24, 31, 32, 50))
# Fail loudly instead of letting data.frame() recycle or error obscurely when
# the index vector no longer matches the test set.
stopifnot(length(test_states) == nrow(test))

# NOTE(review): the y axis label below says "number of racial hate crime" but
# the plotted columns are the *_PER_CAPITA variables -- confirm the wording.

# All racial hate crime: long-format frame with "actual" and "predicted" rows.
t1 <- data.frame(
  hate_crime = test$ALL_HC_PER_CAPITA,
  state = test_states,
  value = "actual"
)
t2 <- data.frame(
  hate_crime = test_pred_df$test_pred,
  state = test_states,
  value = "predicted"
)
total <- rbind(t1, t2)

ggplot(data = total, aes(x = state, y = hate_crime, fill = value)) +
  geom_bar(stat = "identity", color = "black", position = position_dodge()) +
  theme_minimal() +
  scale_fill_manual(values = c(culer[1], rgb(1, 0, 0, .6))) +
  ggtitle("Actual vs. Predicted: All Racial Hate Crime") +
  xlab("state (numerical index)") +
  ylab("number of racial hate crime")

# Anti-Black hate crime. The prediction column is named `test_pred2`; the
# previous `$test_pred` only reached it through `$` partial matching.
t3 <- data.frame(
  hate_crime = test$BLACK_HC_PER_CAPITA_POP,
  state = test_states,
  value = "actual"
)
t4 <- data.frame(
  hate_crime = test_pred_df2$test_pred2,
  state = test_states,
  value = "predicted"
)
total <- rbind(t3, t4)

ggplot(data = total, aes(x = state, y = hate_crime, fill = value)) +
  geom_bar(stat = "identity", color = "black", position = position_dodge()) +
  theme_minimal() +
  scale_fill_manual(values = c(culer[2], rgb(0, 0, 1, .6))) +
  ggtitle("Actual vs. Predicted: Anti-Black Hate Crime") +
  xlab("state (numerical index)") +
  ylab("number of racial hate crime")

# Anti-Asian hate crime. The prediction column is named `test_pred3`; the
# previous `$test_pred` only reached it through `$` partial matching.
t5 <- data.frame(
  hate_crime = test$ASIAN_HC_PER_CAPITA_POP,
  state = test_states,
  value = "actual"
)
t6 <- data.frame(
  hate_crime = test_pred_df3$test_pred3,
  state = test_states,
  value = "predicted"
)
total <- rbind(t5, t6)

ggplot(data = total, aes(x = state, y = hate_crime, fill = value)) +
  geom_bar(stat = "identity", color = "black", position = position_dodge()) +
  theme_minimal() +
  scale_fill_manual(values = c(rgb(0, 0.8, 0.1, .6), rgb(0, 0.5, 0.2, .6))) +
  ggtitle("Actual vs. Predicted: Anti-Asian Hate Crime") +
  xlab("state (numerical index)") +
  ylab("number of racial hate crime")